Pre-processing

Let’s load the datasets.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.4     
## ✔ tidyr   0.7.2          ✔ stringr 1.3.0     
## ✔ readr   1.1.1          ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::vars()   masks ggplot2::vars()
# Load per-account metadata, keep only validated accounts, and lowercase
# the Twitter handle so it joins cleanly against the tweet table below.
user_stats <- read_csv("./data/user-stats.csv") %>%
  filter(Valid == "Yes") %>%
  mutate(screen_name = tolower(`twitter account`)) %>%
  select(screen_name, LABEL) %>% #remove userlevel, e.g., followersCount, favoritesCount, friendsCount 
  # NOTE(review): after the select() above only character columns remain,
  # so this mutate_if() is a no-op -- presumably left over from when the
  # numeric user-level counts were kept. Confirm before removing.
  mutate_if(is.numeric, log) 
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   statusesCount = col_integer(),
##   followersCount = col_integer(),
##   favoritesCount = col_integer(),
##   friendsCount = col_integer(),
##   created = col_datetime(format = ""),
##   protected = col_logical(),
##   verified = col_logical(),
##   id = col_double(),
##   listedCount = col_integer(),
##   followRequestSent = col_logical()
## )
## See spec(...) for full column specifications.
# Load the two-week tweet dump, render timestamps in Eastern time, and
# flag each tweet as Train (before 2017-05-30) or Test.
t <- read_csv("./data/full-tweets-14days.csv") %>%
  mutate(Date = format(created_at, tz="America/New_York"),
         text = iconv(text, to="UTF-8"),
         screen_name = tolower(screen_name),
         # Date is a character string here, so this is a lexicographic
         # comparison -- safe only because the format is YYYY-MM-DD-first.
         DateFlag = case_when(
           Date < "2017-05-30" ~ "Train",
         TRUE ~ "Test"
         )) %>%
  # keep only tweets from validated, labelled accounts
  inner_join(user_stats, by = "screen_name")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   retweet_count = col_double(),
##   favorited = col_logical(),
##   truncated = col_logical(),
##   id_str = col_double(),
##   retweeted = col_logical(),
##   created_at = col_datetime(format = ""),
##   in_reply_to_status_id_str = col_double(),
##   in_reply_to_user_id_str = col_double(),
##   listed_count = col_double(),
##   verified = col_logical(),
##   user_id_str = col_double(),
##   geo_enabled = col_logical(),
##   user_created_at = col_datetime(format = ""),
##   statuses_count = col_double(),
##   followers_count = col_double(),
##   favourites_count = col_double(),
##   protected = col_logical(),
##   utc_offset = col_double(),
##   friends_count = col_double()
## )
## See spec(...) for full column specifications.
# Tweets per account over the two-week window. (This shadows
# dplyr::count(), but the function call on the RHS is evaluated first,
# so the shadowing is harmless here.)
count <- count(t, screen_name, sort = TRUE)

# remove accounts with less than 40 tweets through the 2 weeks
t <- filter(t, !(screen_name %in% count$screen_name[count$n < 40]))

Text Features

library(textfeatures)

## Scale a numeric vector onto [0, 1] assuming a fixed minimum of zero,
## i.e. (x - 0) / (max(x) - 0). NAs are ignored when finding the maximum.
scale_standard <- function(x) {
  x / max(x, na.rm = TRUE)
}

## Compute textfeatures' text statistics, aggregated per account and per
## Train/Test split, then re-attach the account label.
p <- group_by(t, screen_name, DateFlag) %>%
  textfeatures() %>%
  inner_join(user_stats, by = "screen_name") %>%
  ungroup()
  #%>%  mutate_if(is.numeric, scale_standard) 
table(p$LABEL)
## 
##  clickbait       hoax propaganda   realnews     satire 
##         62          6        156         62         18
# Binary target: 1 = realnews, 0 = everything else (clickbait, hoax,
# propaganda, satire). Stored as a factor for classification.
p <- p %>% 
  mutate(y = case_when(LABEL == "realnews" ~ 1L,
                        TRUE ~ 0L)) %>%
  mutate(y = as.factor(y)) %>%
  select(-LABEL)

Lime & Caret

# Split up the data set. Train/Test membership was assigned by date
# above, so the split itself is deterministic; the seed only affects the
# random forest fit below.
set.seed(3456)

train <- filter(p, DateFlag == "Train") %>% select(-screen_name, -DateFlag)
test <- filter(p, DateFlag == "Test") %>% select(-screen_name, -DateFlag)

# Fit on train and score the held-out test set in one call. Drop the
# response from xtest by name rather than by the magic column index 18,
# which silently breaks if textfeatures adds or removes a column.
model <- randomForest::randomForest(y ~ ., data = train,
                                    xtest = select(test, -y),
                                    ytest = test$y, localImp = TRUE)

model
## 
## Call:
##  randomForest(formula = y ~ ., data = train, xtest = test[, -18],      ytest = test$y, localImp = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 9.21%
## Confusion matrix:
##     0  1 class.error
## 0 116  5  0.04132231
## 1   9 22  0.29032258
##                 Test set error rate: 3.29%
## Confusion matrix:
##     0  1 class.error
## 0 120  1 0.008264463
## 1   4 27 0.129032258

Random Forest Explainer

https://mi2datalab.github.io/randomForestExplainer/

library(randomForestExplainer)

# Refit without xtest/ytest for randomForestExplainer -- presumably its
# functions require a plain localImp fit without the test-set slots
# (original note: "rerun without test for rfForest"). Confirm against
# randomForestExplainer's requirements.
model <- randomForest::randomForest(y ~ ., data = train, localImp = TRUE)

Min Depth

# Distribution of each variable's minimal depth across the forest's
# trees; variables that split close to the root are the most important.
min_depth_frame <- min_depth_distribution(model)
plot_min_depth_distribution(min_depth_frame)

Var Importance

# Multi-way variable importance (mean min depth, node counts, accuracy
# and Gini decrease, times-a-root, p-value) from randomForestExplainer.
importance_frame <- measure_importance(model)

knitr::kable(importance_frame)
variable mean_min_depth no_of_nodes accuracy_decrease gini_decrease no_of_trees times_a_root p_value
n_caps 2.606790 404 0.0190791 4.2742470 282 64 0.0000045
n_capsp 1.867438 465 0.0435479 8.4636211 324 114 0.0000000
n_chars 3.673252 253 0.0040539 1.5851752 200 19 0.9999870
n_charsperword 3.958493 229 0.0036038 1.2966192 183 14 1.0000000
n_commas 3.651820 268 0.0051611 1.5784511 210 19 0.9994925
n_digits 3.617910 293 0.0011254 1.2254618 233 0 0.9630694
n_exclaims 1.968170 533 0.0250244 5.2892613 377 55 0.0000000
n_extraspaces 3.490928 324 0.0046426 1.8361239 242 6 0.4932735
n_hashtags 3.128891 384 0.0167857 3.1999076 278 11 0.0003856
n_lowers 2.658716 367 0.0169866 4.4457087 260 74 0.0074937
n_lowersp 1.954313 469 0.0379129 7.4267165 330 98 0.0000000
n_mentions 3.658568 275 0.0051994 1.6103046 217 2 0.9978979
n_nonasciis 3.895066 235 0.0004246 0.8781595 197 1 1.0000000
n_periods 3.872637 260 0.0025483 1.2706910 204 7 0.9999200
n_puncts 3.762780 270 0.0041782 1.6757341 206 0 0.9992242
n_urls 4.188138 197 0.0004920 0.8991808 161 2 1.0000000
n_words 3.605517 271 0.0069132 1.7665049 217 14 0.9990461
# Bar chart of variable importance ordered by accuracy decrease.
# With coord_flip(), the x aesthetic (the feature) is drawn on the
# vertical axis, so x must be labelled "Feature" and y the metric --
# the original labs() had the two labels swapped.
importance_frame %>%
  mutate(variable = as.factor(variable)) %>%
  ggplot(aes(x = forcats::fct_reorder(variable, accuracy_decrease), y = accuracy_decrease)) +
  geom_col() + 
  coord_flip() +
  labs(title = "Feature Importance: Decrease in Accuracy",
       x = "Feature",
       y = "Decrease in Accuracy after Removing Feature")

Variables

# Nine most important features by permutation accuracy decrease.
top9 <- importance_frame %>%
  arrange(desc(accuracy_decrease)) %>%
  head(n=9) %>%
  select(variable) %>%
  mutate(variable = as.character(variable))

# Class-conditional density of each top feature, to see which features
# separate real news from suspicious accounts.
# NOTE(review): y is a factor here; `y==1L` works via character coercion
# against the level "1" -- confirm the factor levels are "0"/"1".
train %>%
  select(c(top9$variable, "y")) %>%
  gather(metric, value, -y) %>%
  mutate(Type = ifelse(y==1L,"Real News","Suspicious")) %>%
  ggplot(aes(x = value, fill = Type)) +
    geom_density(alpha = 0.4) +
    facet_wrap(~metric, scales = "free")

Explore Examples

# Interactive jitter plot of the two strongest features (caps proportion
# and exclamation count) with the account name shown in the tooltip.
z <- p %>%
  filter(DateFlag == "Train") %>%
  select(screen_name, n_capsp, n_exclaims, y) %>%
  gather(metric, value, -screen_name, -y) %>%
  mutate(Type = ifelse(y==1L, "Real News","Suspicious")) %>%
  ggplot(aes(x = metric, y = value, color = Type, text = screen_name)) +
  geom_jitter(height = 0, width = 0.2) +
  coord_flip()

plotly::ggplotly(z, tooltip = c("x","y","text"))

Top 3 Caps Accounts

# Inspect tweets from the three accounts with the highest caps share.
# NOTE(review): head(n=10) is applied to the 3-element list returned by
# split(), so it does NOT limit rows; the 10-row display comes from
# tibble printing. Use lapply(., head, n = 10) to truly truncate.
filter(t, screen_name %in% c("govtslaves","aff_patriots", "henrymakow")) %>%
  select(text, created_at, screen_name) %>%
  split(.$screen_name) %>%
  head(n=10) 
## $aff_patriots
## # A tibble: 182 x 3
##    text                                    created_at          screen_name
##    <chr>                                   <dttm>              <chr>      
##  1 BREAKING: James Bond Star DEAD - https… 2017-05-23 14:46:38 aff_patrio…
##  2 BREAKING: SHOTS FIRED! - https://t.co/… 2017-05-23 16:14:28 aff_patrio…
##  3 BREAKING: Terrifying Details EMERGE Ab… 2017-05-23 17:19:19 aff_patrio…
##  4 BREAKING: Mad Dog Mattis Just Sent N. … 2017-05-23 17:28:09 aff_patrio…
##  5 Internet Absolutely ERUPTS After Every… 2017-05-23 18:04:33 aff_patrio…
##  6 CNN Goes TOO Far With What They Just D… 2017-05-23 19:02:21 aff_patrio…
##  7 BREAKING: America Just Launched MASSIV… 2017-05-23 20:28:53 aff_patrio…
##  8 Multiple DEAD After Suicide Bombing AT… 2017-05-23 20:27:18 aff_patrio…
##  9 BREAKING: ISIS Just EXPANDED To This C… 2017-05-23 21:18:35 aff_patrio…
## 10 BREAKING: New York On HIGH ALERT - htt… 2017-05-23 22:17:36 aff_patrio…
## # ... with 172 more rows
## 
## $govtslaves
## # A tibble: 520 x 3
##    text                                    created_at          screen_name
##    <chr>                                   <dttm>              <chr>      
##  1 HOW WILL BREXIT AFFECT THE UK ONLINE G… 2017-05-23 11:34:59 govtslaves 
##  2 HUGE BITCOIN CORRECTION NOW IMMINENT A… 2017-05-23 16:02:04 govtslaves 
##  3 JUDGE: IT’S OK IF BEST BUY’S GEEK SQUA… 2017-05-23 16:06:46 govtslaves 
##  4 OVER THE LAST 10 YEARS THE U.S. ECONOM… 2017-05-23 16:25:30 govtslaves 
##  5 JOE LIEBERMAN ATOP FBI WOULD BE A FIRS… 2017-05-23 17:22:58 govtslaves 
##  6 TRUMP’S SON-IN-LAW IS A BALTIMORE SLUM… 2017-05-23 19:52:35 govtslaves 
##  7 BLACK LIVES MATTER AWARDED 2017 SYDNEY… 2017-05-23 22:13:31 govtslaves 
##  8 GOOGLE STARTS TRACKING OFFLINE SHOPPIN… 2017-05-23 22:20:57 govtslaves 
##  9 THE BOMBINGS HAVE ONLY JUST BEGUN: “PR… 2017-05-23 22:30:42 govtslaves 
## 10 PAUL PLANS TO FORCE VOTE ON $110B SAUD… 2017-05-24 14:31:01 govtslaves 
## # ... with 510 more rows
## 
## $henrymakow
## # A tibble: 529 x 3
##    text                                    created_at          screen_name
##    <chr>                                   <dttm>              <chr>      
##  1 "RT @DrDavidDuke: 'The Rise Of The Goo… 2017-05-23 07:29:46 henrymakow 
##  2 "Netflix, Feminist suppress film that … 2017-05-23 07:24:08 henrymakow 
##  3 See the Masonic checkerboard on their … 2017-05-23 07:27:28 henrymakow 
##  4 This will continue until we all realiz… 2017-05-23 07:30:52 henrymakow 
##  5 RT @CollegeFix: Professor who called d… 2017-05-23 07:28:06 henrymakow 
##  6 Making Israel Great Again https://t.co… 2017-05-23 07:34:40 henrymakow 
##  7 "Assange accuser connected to CIA\nhtt… 2017-05-23 12:26:00 henrymakow 
##  8 "A new recruit.\nhttps://t.co/xV2H53BY… 2017-05-23 12:28:18 henrymakow 
##  9 "Whistleblower- Surgeon says Rich was … 2017-05-23 12:24:30 henrymakow 
## 10 "Democrat operative ran interference i… 2017-05-23 12:31:30 henrymakow 
## # ... with 519 more rows

Top 3 Exclamation Accounts

# Inspect tweets from the three accounts with the most exclamations,
# keeping only tweets that actually contain a "!".
# NOTE(review): as above, head(n=10) acts on the 3-element list from
# split() and does not limit rows per account.
filter(t, screen_name %in% c("unhealthytruth","100percfedup", "twitchyteam")) %>%
  select(text, created_at, screen_name) %>%
  filter(grepl('!', text)) %>% # keep only those with !'s
  split(.$screen_name) %>%
  head(n=10) 
## $`100percfedup`
## # A tibble: 231 x 3
##    text                                    created_at          screen_name
##    <chr>                                   <dttm>              <chr>      
##  1 "WATCH: Brave Christian Brits Attempt … 2017-05-23 04:17:07 100percfed…
##  2 WOW! HOLLYWOOD CONSERVATIVE Stands Up … 2017-05-23 07:00:03 100percfed…
##  3 New post: UNREAL! THE LEFT OUTRAGED AT… 2017-05-23 17:11:36 100percfed…
##  4 "UNREAL! THE LEFT OUTRAGED AT TRUMP'S … 2017-05-23 18:00:04 100percfed…
##  5 New post: DINGBAT NANCY STRIKES AGAIN!… 2017-05-23 18:14:16 100percfed…
##  6 RT @pat22372: NANCY STRIKES AGAIN! Wat… 2017-05-23 18:29:24 100percfed…
##  7 DINGBAT NANCY STRIKES AGAIN! Watch Nan… 2017-05-23 18:58:45 100percfed…
##  8 RT @seanhannity: IMPORTANT! Mediamatte… 2017-05-23 23:07:10 100percfed…
##  9 New post: BOOM! TREY GOWDY Hammers Ex-… 2017-05-23 23:26:46 100percfed…
## 10 "BOOM! TREY GOWDY Hammers Ex-CIA Chief… 2017-05-24 00:00:04 100percfed…
## # ... with 221 more rows
## 
## $twitchyteam
## # A tibble: 171 x 3
##    text                                    created_at          screen_name
##    <chr>                                   <dttm>              <chr>      
##  1 Wait, WHAT!? Wife of Bernie Sanders ha… 2017-05-23 04:25:31 twitchyteam
##  2 Cowards! AP censors ITSELF in weak hea… 2017-05-23 13:42:02 twitchyteam
##  3 ‘Look who now supports voter ID!’: Dem… 2017-05-23 15:07:10 twitchyteam
##  4 ICYMI ==&gt; Wait, WHAT!? Wife of Bern… 2017-05-23 15:19:06 twitchyteam
##  5 Move over InfoWars, here comes CNN! An… 2017-05-23 15:44:19 twitchyteam
##  6 ‘How were you RAISED?!’ Dana Loesch bl… 2017-05-23 16:17:47 twitchyteam
##  7 DAAAAAMN! John Kincade levels scumbag … 2017-05-23 16:38:59 twitchyteam
##  8 ‘Go BACK to the woods!’ Hillary’s Manc… 2017-05-23 18:38:22 twitchyteam
##  9 Cowards! AP censors ITSELF in weak hea… 2017-05-23 19:06:41 twitchyteam
## 10 ‘No sh*t, Sherlocks’! Prepare to be st… 2017-05-23 20:31:07 twitchyteam
## # ... with 161 more rows
## 
## $unhealthytruth
## # A tibble: 67 x 3
##    text                                    created_at          screen_name
##    <chr>                                   <dttm>              <chr>      
##  1 Huge victory!! https://t.co/4lScqdQGAP  2017-05-23 19:00:04 unhealthyt…
##  2 This is HUGE, people, and making mains… 2017-05-23 22:59:20 unhealthyt…
##  3 We are being censored. Watch for yours… 2017-05-24 00:13:15 unhealthyt…
##  4 I still cannot BELIEVE CNN did this st… 2017-05-24 02:38:28 unhealthyt…
##  5 Check it out! https://t.co/M6U0VoOqbW   2017-05-24 08:29:02 unhealthyt…
##  6 Good news! https://t.co/F6uxdiAXh6      2017-05-24 11:45:02 unhealthyt…
##  7 Good to know! https://t.co/wfRtev7Uhn   2017-05-24 22:29:04 unhealthyt…
##  8 Wow! : ) https://t.co/TjnkJGhCL0        2017-05-25 01:46:02 unhealthyt…
##  9 Check it out! https://t.co/fJTrFm10Bx   2017-05-25 12:07:03 unhealthyt…
## 10 Absolutely!! https://t.co/DrXdtrRFL1    2017-05-25 14:00:05 unhealthyt…
## # ... with 57 more rows

Explainer

# library(lime); library(caret)
# trainIndex <- createDataPartition(p$y, p = .8,
#                                  list = FALSE,
#                                  times = 1)
# 
# x_train <- p[trainIndex,] %>% select(-y, -screen_name)
# x_test <- p[-trainIndex,] %>% select(-y, -screen_name)
# 
# y_train <- p$y[trainIndex]
# y_test <- p$y[-trainIndex]
# 
# # Create Random Forest model on iris data
# model <- train(x_train, y_train, method = 'rf', localImp = TRUE)
# 
# train <- p[trainIndex,] %>% select(-screen_name)
# test <- p[-trainIndex,] %>% select(-screen_name)
# # Create an explainer object
# explainer <- lime(x_train, model)
# 
# # Explain new observation
# explanation <- explain(x_test, explainer, n_labels = 1, n_features = 2)
# 
# plot_features(explanation)

Index

n_chars = n_charS(text2),
n_commas = n_commas(text2),
n_digits = n_digits(text2),
n_exclaims = n_exclaims(text2),
n_extraspaces = n_extraspaces(text2),
n_hashtags = n_hashtags(text),
n_lowers = n_lowers(text2),
n_lowersp = (n_lowers + 1L) / (n_chars + 1L),
n_mentions = n_mentions(text),
n_periods = n_periods(text2),
n_urls = n_urls(text),
n_words = n_words(text2),
n_caps = n_caps(text2),
n_nonasciis = n_nonasciis(text2),
n_puncts = n_puncts(text2),
n_capsp = (n_caps + 1L) / (n_chars + 1L),
n_charsperword = (n_chars + 1L) / (n_words + 1L)
n_words <- function(x) {
  # Count whitespace-separated tokens per element, after deleting digits.
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  no_digits <- gsub("\\d", "", x)
  counts <- lengths(strsplit(no_digits, "\\s+"))
  counts[na] <- NA_integer_
  counts
}

n_charS <- function(x) {
  # Character count per element, excluding all whitespace.
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  out <- nchar(gsub("\\s", "", x))
  out[na] <- NA_integer_
  out
}

n_digits <- function(x) {
  # Count digit characters per element by deleting every non-digit
  # and measuring what remains.
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  out <- nchar(gsub("\\D", "", x))
  out[na] <- NA_integer_
  out
}

n_hashtags <- function(x) {
  # Count hashtag tokens ("#" followed by non-space) per element.
  # Returns scalar 0 when every element is NA (upstream textfeatures
  # behaviour, preserved); individual NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("#\\S+", x)
  # gregexpr() yields -1 for "no match", so counting positive match
  # starts gives the number of hashtags. vapply() replaces textfeatures'
  # non-exported helper vply_int(), which is not defined in this script.
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_mentions <- function(x) {
  # Count @-mention tokens ("@" followed by non-space) per element.
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("@\\S+", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_commas <- function(x) {
  # Count runs of one or more commas per element (",," counts once).
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr(",+", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_periods <- function(x) {
  # Count runs of one or more periods per element ("..." counts once).
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("\\.+", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_exclaims <- function(x) {
  # Count runs of one or more exclamation marks per element ("!!"
  # counts once). Returns scalar 0 when every element is NA; NA
  # elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("\\!+", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_extraspaces <- function(x) {
  # Count stretches of extra whitespace per element: a run of 2+
  # whitespace characters, a tab, or a newline each count once.
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("\\s{2,}|\\t|\\n", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_caps <- function(x) {
  # Count upper-case letters per element (each letter counts once).
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("[[:upper:]]", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_lowers <- function(x) {
  # Count lower-case letters per element (each letter counts once).
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("[[:lower:]]", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_urls <- function(x) {
  # Count URL schemes per element by matching "http:" or "https:".
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("https?:", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_nonasciis <- function(x) {
  # Count non-ASCII characters per element by transliterating each one
  # to the sentinel "[NONASCII]" and counting the sentinels. Assumes
  # the input is UTF-8 encoded (the caller iconv()s text to UTF-8).
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  x <- iconv(x, from = "UTF-8", to = "ASCII", sub = "[NONASCII]")
  m <- gregexpr("\\[NONASCII\\]", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}

n_puncts <- function(x) {
  # Count punctuation characters per element, excluding "!", ".", and
  # "," (those are counted by n_exclaims/n_periods/n_commas).
  # Returns scalar 0 when every element is NA; NA elements yield NA.
  na <- is.na(x)
  if (all(na)) return(0)
  x <- gsub("!|\\.|\\,", "", x)
  m <- gregexpr("[[:punct:]]", x)
  # vapply() replaces the non-exported textfeatures helper vply_int().
  x <- vapply(m, function(starts) sum(starts > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}